{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from fastai.text import * \n",
    "from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OBTAINING DATA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
      "test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
      "train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)\n"
     ]
    }
   ],
   "source": [
    "# Downloading data\n",
    "!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Archive:  test.csv.zip\n",
      "  inflating: data/test.csv           \n",
      "Archive:  train.csv.zip\n",
      "  inflating: data/train.csv          \n",
      "Archive:  sample_submission.csv.zip\n",
      "  inflating: data/sample_submission.csv  \n"
     ]
    }
   ],
   "source": [
    "# Unzipping data\n",
    "!unzip test.csv.zip -d data/\n",
    "!unzip train.csv.zip -d data/\n",
    "!unzip sample_submission.csv.zip -d data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting data path\n",
    "DATA_PATH = './data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reading data into dataframe\n",
    "df_train = pd.read_csv(DATA_PATH + '/train.csv')\n",
    "df_test = pd.read_csv(DATA_PATH + '/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using sampling\n",
    "df_train = df_train.head(100)\n",
    "df_test = df_test.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all = pd.concat([df_train, df_test], ignore_index=True, sort=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LANGUAGE MODEL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating databunch with all data for language model and store it\n",
    "databunch_lm = (\n",
    "    TextList.from_df(df=df_all, path=DATA_PATH, cols=['comment_text'])\n",
    "        .split_by_rand_pct(0.1)\n",
    "        .label_for_lm()\n",
    "        .databunch())\n",
    "\n",
    "databunch_lm.save('databunch_lm.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load language model databunch\n",
    "databunch_lm = load_data(DATA_PATH, file='databunch_lm.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fastai/text/learner.py:209: UserWarning: There are no pretrained weights for that architecture yet!\n",
      "  warn(\"There are no pretrained weights for that architecture yet!\")\n"
     ]
    }
   ],
   "source": [
    "# Creating learner with model\n",
    "language_model_learner = language_model_learner(databunch_lm, TransformerXL, drop_mult=0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finding the best learning rate \n",
    "language_model_learner.unfreeze()\n",
    "language_model_learner.lr_find()\n",
    "language_model_learner.recorder.plot(suggestion=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Total time: 03:10 <p><table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>7.487820</td>\n",
       "      <td>5.899519</td>\n",
       "      <td>0.318973</td>\n",
       "      <td>00:37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>6.807964</td>\n",
       "      <td>5.459869</td>\n",
       "      <td>0.057589</td>\n",
       "      <td>00:37</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>6.298919</td>\n",
       "      <td>4.480237</td>\n",
       "      <td>0.318973</td>\n",
       "      <td>00:38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>5.931885</td>\n",
       "      <td>4.477242</td>\n",
       "      <td>0.318973</td>\n",
       "      <td>00:38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5.718742</td>\n",
       "      <td>4.489619</td>\n",
       "      <td>0.318973</td>\n",
       "      <td>00:38</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Training language model\n",
    "language_model_learner.unfreeze()\n",
    "language_model_learner.fit_one_cycle(5, 1e-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saving encoder\n",
    "language_model_learner.save_encoder('encoder_transformer_xl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CLASSIFIER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating databunch for classifier using previous vocab and storing it\n",
    "databunch_classifier = (\n",
    "        TextList.from_df(df=df_train, path=DATA_PATH, cols=['comment_text'], vocab=databunch_lm.train_ds.vocab)\n",
    "            .split_by_rand_pct(0.1)\n",
    "            .label_from_df(cols='target')\n",
    "            .add_test(TextList.from_df(df_test, path=DATA_PATH))\n",
    "            .databunch())\n",
    "\n",
    "databunch_classifier.save('databunch_classifier.pkl')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load classifier databunch\n",
    "databunch_classifier = load_data(DATA_PATH, file='databunch_classifier.pkl', bs=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>xxbos xxmaj tuesday xxmaj jan. xxunk , xxunk of \\n  this xxunk ' xxunk ' xxunk to \\n  xxmaj john xxmaj day , xxmaj or to try to xxunk the xxunk \\n  there to come to xxmaj malheur and support them . xxmaj he xxunk to go \\n  without the xxunk of the local \\n  sheriff in xxmaj burns , xxmaj xxunk xxmaj ward .</td>\n",
       "      <td>0.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>xxbos xxmaj tuesday xxmaj jan. xxunk , xxunk of \\n  this xxunk ' xxunk ' xxunk to \\n  xxmaj john xxmaj day , xxmaj or to try to xxunk the xxunk \\n  there to come to xxmaj malheur and support them . xxmaj he xxunk to go \\n  without the xxunk of the local \\n  sheriff in xxmaj burns , xxmaj xxunk xxmaj ward .</td>\n",
       "      <td>0.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>xxbos xxmaj xxunk xxmaj david , we did xxunk all of these xxunk when xxunk our system ! xxmaj in xxunk , we assume people will try to xxunk it ; that 's why we have a lot going on xxunk the xxunk xxunk . xxmaj the xxunk - xxunk xxunk of the xxunk is very xxunk , but there are a lot of xxunk and xxunk on the backend</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>xxbos xxmaj one would hope that the xxunk of xxunk this system is to xxunk more debate and discussion , not less . xxmaj it seems there are xxunk things that xxunk the xxunk of discussion : xxmaj xxunk comments that are xxunk xxunk or xxunk xxunk xxunk ; xxunk xxunk ( which has been a real problem on xxup ww 's comment section , in my xxunk ) ;</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>xxbos xxmaj xxunk xxmaj bundy seems like a nice , xxunk xxunk being who has been xxunk a xxunk of xxunk by his xxunk xxunk xxunk , who he xxunk . xxmaj xxunk has a bunch of other xxunk xxunk xxunk following xxunk as well . xxmaj where ? xxmaj no one xxunk xxunk . xxmaj and , xxunk you xxunk in xxmaj burns no one xxunk . xxmaj xxunk</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Example of a batch\n",
    "databunch_classifier.show_batch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/fastai/text/learner.py:294: UserWarning: There are no pretrained weights for that architecture yet!\n",
      "  warn(\"There are no pretrained weights for that architecture yet!\")\n"
     ]
    }
   ],
   "source": [
    "# Creating text classifier learner\n",
    "classifier_learner = text_classifier_learner(databunch_classifier, TransformerXL, drop_mult=0.05)\n",
    "classifier_learner.load_encoder('encoder_transformer_xl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finding learning rate\n",
    "classifier_learner.lr_find()\n",
    "classifier_learner.recorder.plot(suggestion=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Total time: 00:15 <p><table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.073298</td>\n",
       "      <td>0.021866</td>\n",
       "      <td>00:15</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Training only classifier last layer\n",
    "classifier_learner.fit_one_cycle(1, 1e-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finding the best learning rate when unfreezed\n",
    "classifier_learner.unfreeze()\n",
    "classifier_learner.lr_find()\n",
    "classifier_learner.recorder.plot(suggestion=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Total time: 02:55 <p><table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: left;\">\n",
       "      <th>epoch</th>\n",
       "      <th>train_loss</th>\n",
       "      <th>valid_loss</th>\n",
       "      <th>time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.058781</td>\n",
       "      <td>0.021065</td>\n",
       "      <td>00:33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.057206</td>\n",
       "      <td>0.020406</td>\n",
       "      <td>00:35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.059186</td>\n",
       "      <td>0.019833</td>\n",
       "      <td>00:33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.060962</td>\n",
       "      <td>0.019289</td>\n",
       "      <td>00:33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.058467</td>\n",
       "      <td>0.018798</td>\n",
       "      <td>00:38</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Training all classifier\n",
    "classifier_learner.unfreeze()\n",
    "classifier_learner.fit_one_cycle(5, slice(1e-6, 1e-4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save model\n",
    "classifier_learner.save('classifier_model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export model\n",
    "classifier_learner.export('classifier_transformer_xl.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model\n",
    "classifier_learner = load_learner(DATA_PATH,'classifier_transformer_xl.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(FloatItem [0.011184], tensor([0.0112]), tensor([0.0112]))"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Example\n",
    "classifier_learner.predict(\"This kid son of a bitch! you are a motherfucker!!! I will kill you!!!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TESTING"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Testing out of fold (validation)\n",
    "oof_preds = classifier_learner.get_preds(ds_type=DatasetType.Valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert predictions to numpy\n",
    "oof_prediction = oof_preds[0].cpu().data.numpy()\n",
    "oof_labels = oof_preds[1].cpu().data.numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label as toxic when threshold above 0.5\n",
    "oof_labels_booleans = oof_labels>=0.5\n",
    "oof_prediction_booleans = oof_prediction>=0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Accuracy\n",
    "accuracy_score(oof_labels_booleans, oof_prediction_booleans)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROC-AUC\n",
    "roc_auc_score(oof_labels_booleans,oof_prediction_booleans)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GENERATE SUBMISSION"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate preditions for test set\n",
    "test_preds = classifier_learner.get_preds(ds_type=DatasetType.Test, ordered=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to numpy\n",
    "test_prediction = test_preds[0].cpu().data.numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read submission file\n",
    "submission = pd.read_csv(DATA_PATH + '/sample_submission.csv', index_col='id')\n",
    "submission = submission.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill submission file with predictions\n",
    "submission['prediction'] = test_prediction[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>prediction</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7000000</th>\n",
       "      <td>0.042214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7000001</th>\n",
       "      <td>0.042214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7000002</th>\n",
       "      <td>0.042214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7000003</th>\n",
       "      <td>0.042214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7000004</th>\n",
       "      <td>0.042214</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         prediction\n",
       "id                 \n",
       "7000000    0.042214\n",
       "7000001    0.042214\n",
       "7000002    0.042214\n",
       "7000003    0.042214\n",
       "7000004    0.042214"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Store submission\n",
    "submission.to_csv('submission.csv')\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
